suppressPackageStartupMessages({
import(rpkgs)
})
import(run)
import(util)
## [1] TRUE
Baseline model: predict target = the average value for the asset.
# Identifier for this model; used as the run name in doRun() and as the
# lookup key in the `metrics` table queried later in this file.
modelName = "baseline-avg"
# getAllAssets() is a project helper; presumably returns a data.table with
# asset_id and asset_name columns (both are indexed below) — confirm in util.
assets = getAllAssets()
## 2021-12-01 03:19:56 INFO::Sourcing ALL_ASSETS
# Run one experiment of the baseline per-asset-mean model.
#
# Pulls a training/test window via the project's doRun() harness, fits a
# trivial model (the mean of `target` per asset over the training window),
# and predicts that constant for each asset in the test window.
# Returns whatever doRun() returns (used below as `results`).
runModel = \() {
  doRun(
    name = modelName,
    trnAmt = 60 * 24 * 7 * 1, # 1 week of data, chosen arbitrarily
    tstAmt = 60 * 24 * 7 * 2, # 2 weeks, submission period will provide new data every 2 weeks
    assets = assets[,asset_id],
    makeData = \(env, minDate, maxDate, assets, ...) {
      # NOTE(review): asset ids are interpolated directly into the SQL text.
      # This is only safe because they come from our own getAllAssets() query;
      # confirm they are numeric before reusing this pattern with any
      # externally supplied values.
      selectStmt = glue('
        SELECT ts, asset_id, asset_name, target
        FROM trn
        WHERE (ts BETWEEN $1 AND $2)
        AND asset_id IN ({paste(assets, collapse = ", ")})
      ')
      df = getQuery(selectStmt, params = list(minDate, maxDate))
      env$x = df[,.(ts, asset_id, asset_name)]
      env$y = df[,.(target)]
    },
    trainModel = \(model, trn, ...) {
      # give the model a description
      model$description = 'mean of target'
      # NB: paste() (not paste0) yields keys like "asset- 42" (with a space);
      # harmless as long as train and predict share this same function.
      model$getKeyForAsset = \(a) paste("asset-", a)
      for (a in unique(trn$x[,asset_id])) {
        idx = trn$x[,asset_id] == a
        key = model$getKeyForAsset(a)
        # mean(..., na.rm = TRUE) yields NaN when every target is NA;
        # is.na(NaN) is TRUE, so the fallback below covers that case too.
        prediction = mean(trn$y[idx,target], na.rm = TRUE)
        if (is.na(prediction)) prediction = 0
        model[[key]] = prediction
      }
    },
    predictModel = \(model, tst, ...) {
      # use advanced machine learning algorithm to predict crypto movement
      # Preallocate directly as NA (the original built a zero vector and then
      # overwrote it with NA via 1:length(...) — same result, two steps).
      tst$yhat = rep(NA_real_, nrow(tst$x))
      for (a in unique(tst$x[,asset_id])) {
        idx = tst$x[,asset_id] == a
        key = model$getKeyForAsset(a)
        # NOTE(review): an asset present in tst but absent from training would
        # make model[[key]] NULL and this assignment error — same as the
        # original behavior; confirm doRun() guarantees matching asset sets.
        tst$yhat[idx] <- model[[key]]
      }
    }
  )
}
Same method as was used for the baseline “target = 0” model.
# Repeat the experiment many times. Each runModel() call presumably persists
# its metrics (the `metrics` table is queried further down) — only the final
# run's in-memory results are kept here, for the sanity-check plots below.
numSamples = 610
set.seed(205794)
# seq_len() instead of 1:numSamples — safe if the count is ever zero.
for (i in seq_len(numSamples)) {
  results = runModel()
}
We can examine the results from the last run as a sanity check.
# Assemble a small frame of timestamps, actuals, and predictions from the
# final run so we can eyeball prediction vs. target.
df = results$tst$x
df$y = results$tst$y$target
df$yhat = results$tst$yhat
set.seed(68420)
# sample of data: a random 200-minute window, for two randomly chosen assets
plotStart = sample(df[,ts], 1)
plotEnd = plotStart + as.difftime(200, units = "mins")
sampledAssets = assets[sample(nrow(assets), 2),asset_name]
sanityPlots = lapply(sampledAssets, \(asset) {
  windowed = df[asset_name == asset & ts > plotStart & ts < plotEnd]
  long = melt(
    windowed,
    id.vars = c("ts", "asset_name"),
    measure.vars = c("y", "yhat")
  )
  ggplot(long, aes(ts, value, colour = variable)) +
    geom_line() +
    facet_wrap(~asset_name, ncol = 1)
})
print(sanityPlots)
## Warning: Removed 7 row(s) containing missing values (geom_path).
The competition metric is correlation between your predictions and the targets.
Visualising this:
## Warning: Removed 62809 rows containing non-finite values (stat_bin2d).
Remember, that’s just for 1 run; we repeated that experiment 610 times!
# Fetch every recorded metric row for this model and render an interactive
# table of the headline columns.
scores = getQuery(
  'SELECT * FROM metrics WHERE name = $1',
  params = list(modelName)
)
summaryCols = c("run_id", "corr", "mae", "aae", "rmse")
DT::datatable(scores[, ..summaryCols])
# Summarise a vector of per-run scores: a histogram with a fitted normal
# curve, mean/sem/sd guide lines, plus a Shapiro-Wilk normality test and a
# one-sample t-test against zero.
#
# x: numeric vector of scores (NAs tolerated).
# Returns an environment of class "performancePlot" with fields:
#   relativePlot, absolutePlot, count, mean, sd, sem, range, binwidth,
#   shapiro.test, t.test
performancePlot = \(x) {
  # https://stackoverflow.com/a/36344354
  # Locals deliberately avoid shadowing base functions (the original bound
  # `mean`, `sd`, `range`, and `count` as locals, which is fragile).
  n = sum(!is.na(x))
  mu = mean(x, na.rm = TRUE)
  sdev = sd(x, na.rm = TRUE)
  sem = sdev / sqrt(n)
  rng = diff(range(x, na.rm = TRUE))
  breaks = 30
  binwidth = rng / breaks
  # NOTE(review): hist() treats `breaks` as a suggestion, so its counts may
  # not match geom_histogram(binwidth = ...) exactly; labelY only positions
  # the annotation, so the approximation is acceptable.
  labelY = max(hist(x, breaks = breaks, plot = FALSE)$counts)
  # Build one histogram+curve plot. plotRange = "relative" centers the x-axis
  # curve on the sample mean; "absolute" spans the full [-0.05, 1] corr range.
  makePlot = \(plotRange) {
    if (plotRange == "relative") {
      curveX = seq(mu - rng/2, mu + rng/2, length.out = 100)
      labelX = mu + rng/2
    } else if (plotRange == "absolute") {
      curveX = seq(-0.05, 1.0, length.out = 1000)
      labelX = 1.0
    } else {
      # Previously an unknown value fell through to "object 'curveX' not
      # found"; fail with a clear message instead.
      stop("unknown plotRange: ", plotRange)
    }
    annFmt = \(v) format(round(v, 5), nsmall=5)
    mcorrAnn = annotate(
      "text",
      label = glue('mean: {annFmt(mu)}\ns.err: {annFmt(sem)}\ns.dev: {annFmt(sdev)}'),
      family = "monospace",
      hjust = "right",
      vjust = "top",
      x = labelX, y = labelY
    )
    # Expected bin counts under a normal fit, scaled to the histogram.
    curveY = dnorm(curveX, mean = mu, sd = sdev) * binwidth * n
    curveD = data.table(
      corr = curveX,
      count = curveY
    )
    ggplot(data.frame(x = x), aes(x)) +
      xlab("corr") +
      ylab("count") +
      geom_histogram(binwidth = binwidth) +
      geom_line(data = curveD, aes(x = corr, y = count)) +
      geom_vline(xintercept = mu, color = "green") +
      geom_vline(xintercept = mu - 2.5*sem, color = "green", alpha = 0.5) +
      geom_vline(xintercept = mu + 2.5*sem, color = "green", alpha = 0.5) +
      geom_vline(xintercept = mu - 2.5*sdev, color = "blue", alpha = 0.5) +
      geom_vline(xintercept = mu + 2.5*sdev, color = "blue", alpha = 0.5) +
      mcorrAnn +
      labs(
        title = glue('Distribution of corr ({plotRange})')
      )
  }
  results = new.env(parent = .GlobalEnv)
  results$relativePlot = makePlot("relative")
  results$absolutePlot = makePlot("absolute")
  results$count = n
  results$mean = mu
  results$sd = sdev
  results$sem = sem
  results$range = rng
  results$binwidth = binwidth
  results$shapiro.test = shapiro.test(x)
  results$t.test = t.test(x)
  class(results) <- append(class(results), "performancePlot")
  results
}
# format() method: render the summary statistics and both hypothesis tests
# as a single string. Takes `...` per the format() generic contract.
# Fix: the original captured the Shapiro-Wilk output into `sw` but never
# interpolated it into the template (the rendered output shows it was meant
# to appear after the t-test).
format.performancePlot = \(p, ...) {
  sw = paste(capture.output(print(p$shapiro.test)), collapse = "\n")
  tt = paste(capture.output(print(p$t.test)), collapse = "\n")
  glue('performancePlot:
  count = {p$count}
  mean = {p$mean}
  sd = {p$sd}
  sem = {p$sem}
  range = {p$range}
  binwidth = {p$binwidth}
  relativePlot = <ggplot plot object>
  absolutePlot = <ggplot plot object>
  t.test = {tt}
  shapiro.test = {sw}')
}
# print() method: delegate to format(), then return the object invisibly —
# the convention for print methods (the original returned cat()'s NULL).
print.performancePlot = \(p, ...) {
  cat(format(p), ...)
  invisible(p)
}
# Summarise the distribution of correlation scores across all recorded runs.
p = scores[,corr] |> performancePlot()
p |> print()
## performancePlot:
## count = 610
## mean = 0.00112118099786689
## sd = 0.00900130400299388
## sem = 0.000364452146118459
## range = 0.069905162
## binwidth = 0.00233017206666667
## relativePlot = <ggplot plot object>
## absolutePlot = <ggplot plot object>
## t.test =
## One Sample t-test
##
## data: x
## t = 3.0763, df = 609, p-value = 0.00219
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.0004054455 0.0018369165
## sample estimates:
## mean of x
## 0.001121181
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.97935, p-value = 1.405e-07